from googleapiclient.discovery import build
from dateutil import parser
import pandas as pd
from IPython.display import JSON
# Data viz packages
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
# NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
[nltk_data] Downloading package stopwords to [nltk_data] /Users/luqiansong/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] /Users/luqiansong/nltk_data... [nltk_data] Package punkt is already up-to-date!
# load datasets
df = pd.read_csv('/Users/luqiansong/Desktop/22201381.csv')
#getting basic information about datasets
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8000 entries, 0 to 7999 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 8000 non-null int64 1 category 7980 non-null object 2 headline 7987 non-null object 3 authors 6951 non-null object 4 link 7981 non-null object 5 short_description 7150 non-null object 6 date 7982 non-null object dtypes: int64(1), object(6) memory usage: 437.6+ KB
# check the dataset for blank values
df.isnull().sum()
Unnamed: 0 0 category 20 headline 13 authors 1049 link 19 short_description 850 date 18 dtype: int64
# found blank values exist in every columns
df.isnull().any()
Unnamed: 0 False category True headline True authors True link True short_description True date True dtype: bool
# Check data types to validate datatype
df.dtypes
Unnamed: 0 int64 category object headline object authors object link object short_description object date object dtype: object
df.shape
(8000, 7)
# check incorrect/imbalance data thourgh class count
category_counts = df['category'].value_counts()
category_counts.plot(kind='bar')
plt.title("Category Distribution")
plt.xlabel("Category")
plt.ylabel("Count")
plt.show()
import pandas as pd
df = pd.read_csv('/Users/luqiansong/Desktop/22201381.csv')
int_columns = ['category','headline', 'authors', 'short_description', 'date']
import os
import numpy as np
import cv2
import random
import sklearn
import keras
import PIL
import matplotlib.pyplot as plt
import seaborn as sns
from keras import backend as K
from keras.layers import AveragePooling2D
from tensorflow.keras.optimizers import RMSprop
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv2D, MaxPooling2D, Flatten
from keras.callbacks import ModelCheckpoint
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import tensorflow as tf
2023-07-07 13:37:36.848599: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE4.1 SSE4.2 To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
# analyse the lenght of the sentences in each category
# length: analyse relationship between short_description and category
df = pd.read_csv('/Users/luqiansong/Desktop/22201381.csv')
df['headline_length'] = df['headline'].str.len()
# Group the DataFrame by column 'category' and calculate descriptive statistics for headline length
grouped_lengths = df.groupby('category')['headline_length'].describe()
# Display the descriptive statistics for headline length in each category
display(grouped_lengths)
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| category | ||||||||
| POLITICS | 5973.0 | 63.721915 | 17.481447 | 11.0 | 53.0 | 65.0 | 75.0 | 189.0 |
| SPORTS | 1994.0 | 62.472919 | 16.363946 | 10.0 | 53.0 | 64.0 | 72.0 | 109.0 |
# length analyse relationship between headline and category
df['short_description_length'] = df['short_description'].str.len()
# Group the DataFrame by column 'category' and calculate descriptive statistics for headline length
grouped_lengths = df.groupby('category')['short_description_length'].describe()
# Display the descriptive statistics for headline length in each category
display(grouped_lengths)
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| category | ||||||||
| POLITICS | 5399.0 | 104.167438 | 76.482309 | 1.0 | 64.0 | 92.0 | 122.5 | 1136.0 |
| SPORTS | 1731.0 | 88.634315 | 61.772038 | 1.0 | 43.0 | 75.0 | 121.0 | 374.0 |
# check outlier according the text length
import pandas as pd
df = pd.read_csv('/Users/luqiansong/Desktop/22201381.csv')
# Convert float values in 'text' column to strings
df['text_length'] = df['short_description'].astype(str)
# Calculate the length of each text sample
df['text_length'] = df['short_description'].apply(lambda x: len(str(x)))
# Calculate the mean and standard deviation of text lengths
mean_length = df['text_length'].mean()
std_length = df['text_length'].std()
# Define a threshold for outlier detection
threshold = mean_length + 3 * std_length
# Identify outliers based on text length
outliers = df[df['text_length'] > threshold]
outliers
| Unnamed: 0 | category | headline | authors | link | short_description | date | text_length | |
|---|---|---|---|---|---|---|---|---|
| 3 | 3 | POLITICS | An Open Letter to My Fellow Millennials on Hil... | Nick Laure, ContributorAn advocate for logical... | https://www.huffingtonpost.com/entry/an-open-l... | I am not asking anyone to stop supporting Bern... | 2016-03-20 | 318 |
| 59 | 59 | POLITICS | Can You Catch It From a Caliph? Ebola, ISIS, ... | M. Gregg Bloche, M.D., J.D., ContributorAuthor... | https://www.huffingtonpost.com/entry/can-you-c... | Two potent forces power the Ebola and ISIS epi... | 2014-10-02 | 318 |
| 72 | 72 | POLITICS | In Defense of Christians | James Zogby, ContributorPresident, Arab Americ... | https://www.huffingtonpost.com/entry/in-defens... | We have every reason to be concerned with the ... | 2014-09-06 | 345 |
| 238 | 238 | POLITICS | Democrats Should Take the Megalomaniac Seriously | Dave R. Jacobson, ContributorDemocratic Strate... | https://www.huffingtonpost.com/entry/democrats... | "I'm a unifier," said Donald Trump, the odds-o... | 2016-03-13 | 334 |
| 540 | 540 | POLITICS | Sunday Roundup | Arianna Huffington, Contributor | https://www.huffingtonpost.com/entry/sunday-ro... | LONDON -- This week began with the continuing ... | 2014-06-01 | 933 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7685 | 7685 | POLITICS | Adam Smith vs. Ayn Rand | David Morris, ContributorDirector, The Public ... | https://www.huffingtonpost.com/entry/adam-smit... | Even with Medicare and Medicaid, tens of milli... | 2015-06-02 | 324 |
| 7777 | 7777 | POLITICS | Sunday Roundup | Arianna Huffington, Contributor | https://www.huffingtonpost.com/entry/sunday-ro... | This week, the nation watched Kobe Bryant say ... | 2016-04-17 | 1067 |
| 7820 | 7820 | POLITICS | Social Security Expansion Key to Averting Reti... | Ben Veghte, ContributorVice President for Poli... | https://www.huffingtonpost.com/entry/social-se... | Workers today need to be saving much more for ... | 2015-06-03 | 363 |
| 7840 | 7840 | POLITICS | Sunday Roundup | Arianna Huffington, Contributor | https://www.huffingtonpost.com/entry/sunday-ro... | This week, the nation was once again shocked b... | 2015-10-04 | 898 |
| 7897 | 7897 | POLITICS | The Three Stooges of the Grand Obstructionist ... | Lance Simmens, ContributorAuthor, "Fracktured"... | https://www.huffingtonpost.com/entry/the-three... | Dick Cheney, Bill O'Reilly, and Rudy Giuliani ... | 2015-02-22 | 329 |
86 rows × 8 columns
import matplotlib.pyplot as plt
# Plot histogram of text lengths
plt.hist(df['text_length'], bins=50)
plt.xlabel('Text Length')
plt.ylabel('Frequency')
plt.title('Distribution of Text Lengths')
plt.show()
# Plot boxplot of text lengths
plt.boxplot(df['text_length'])
plt.xlabel('Text Length')
plt.title('Boxplot of Text Lengths')
plt.show()
# check incorrect/imbalance data thourgh class distribution
import matplotlib.pyplot as plt
# Calculate class distribution
class_distribution = df['category'].value_counts(normalize=True)
# Plot class distribution
plt.bar(class_distribution.index, class_distribution.values)
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.title('Class Distribution')
plt.show()
# Calculate class counts
class_counts = df['category'].value_counts()
# Check Value Counts and Unique Values also check for logical inconsistencies
value_counts = df['category'].value_counts()
unique_values = df['category'].unique()
value_counts
category POLITICS 5983 SPORTS 1997 Name: count, dtype: int64
eed
unique_values
array(['POLITICS', 'SPORTS', nan], dtype=object)
# I found unnamed column , removed it
df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
# Analyse the infomation of short_description
df = pd.read_csv('/Users/luqiansong/Desktop/22201381.csv')
from wordcloud import WordCloud
# Convert float values in 'short_description' column to strings
df['short_description'] = df['short_description'].astype(str)
# Join the different processed descriptions together
long_string = ' '.join(list(df['short_description'].values))
# Create a WordCloud object
wordcloud = WordCloud(background_color='white', max_words=5000, contour_width=3, contour_color='steelblue')
wordcloud.generate(long_string)
# Display the word cloud
plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
df['desc_lemmatized'] = df['short_description'].str.replace('[^\w\s]','')
/var/folders/m_/m3lsq_594494n7k5zm6nmdtc0000gn/T/ipykernel_48720/2591372573.py:1: FutureWarning: The default value of regex will change from True to False in a future version.
df['desc_lemmatized'] = df['short_description'].str.replace('[^\w\s]','')
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
new_words = {
'cpos': -3.0,
'cneg': 3.0,
}
analyser = SentimentIntensityAnalyzer()
analyser.lexicon.update(new_words)
scores=[]
for i in range(len(df['desc_lemmatized'])):
score = analyser.polarity_scores(df['desc_lemmatized'][i])
score=score['compound']
scores.append(score)
sentiment=[]
for i in scores:
if i>=0.75:
sentiment.append('Overly Positive')
elif (i>=0.05) and (i<0.75):
sentiment.append('Positive')
elif i<=(-0.75):
sentiment.append('Overly Negative')
elif (i<=-0.05) and (i>-0.75):
sentiment.append('Negative')
else:
sentiment.append('Neutral')
df['sentiment']= pd.Series(np.array(sentiment))
df['score']= pd.Series(np.array(scores))
df.groupby(by="sentiment").mean()
/var/folders/m_/m3lsq_594494n7k5zm6nmdtc0000gn/T/ipykernel_48720/4284074795.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
| Unnamed: 0 | score | |
|---|---|---|
| sentiment | ||
| Negative | 4008.986454 | -0.402634 |
| Neutral | 4003.686468 | 0.000086 |
| Overly Negative | 4112.705674 | -0.833058 |
| Overly Positive | 3949.543974 | 0.829630 |
| Positive | 3977.015938 | 0.418560 |
df.head()
| Unnamed: 0 | category | headline | authors | link | short_description | date | sentiment | desc_lemmatized | score | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | POLITICS | Watch Highlights From The Last GOP Debate Befo... | Amber Ferguson | https://www.huffingtonpost.com/entry/new-hamps... | Marco Rubio had a rough night. | 2016-02-07 | Neutral | Marco Rubio had a rough night | 0.0000 |
| 1 | 1 | SPORTS | Bob Costas And His Fedora Are 'Thursday Night ... | Lee Moran | https://www.huffingtonpost.com/entry/bob-costa... | "Bob Costas' hat just got its own Martin Scors... | 2016-12-23 | Neutral | Bob Costas hat just got its own Martin Scorses... | 0.0000 |
| 2 | 2 | POLITICS | Hillary Clinton Met Privately With Elizabeth W... | NaN | https://www.huffingtonpost.com/entry/hillary-c... | nan | 2015-02-17 | Neutral | nan | 0.0000 |
| 3 | 3 | POLITICS | An Open Letter to My Fellow Millennials on Hil... | Nick Laure, ContributorAn advocate for logical... | https://www.huffingtonpost.com/entry/an-open-l... | I am not asking anyone to stop supporting Bern... | 2016-03-20 | Overly Positive | I am not asking anyone to stop supporting Bern... | 0.9136 |
| 4 | 4 | POLITICS | Key California Lawmaker Steps Down Amid Harass... | Mollie Reilly | https://www.huffingtonpost.com/entry/raul-boca... | The state assemblyman announced Monday he'll r... | 2017-11-21 | Overly Negative | The state assemblyman announced Monday hell re... | -0.7906 |
import pandas as pd
import jinja2
plt.figure(figsize=(12,6))
sns.countplot(x='sentiment',data=df)
fig = go.Figure(go.Funnelarea(
text =temp.sentiment,
values = temp.desc_lemmatized,
title = {"position": "top center", "text": "Funnel-Chart of Sentiment Distribution"}
))
fig.show()
from sklearn.feature_selection import mutual_info_classif
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Load your DataFrame
frame = pd.read_csv("/Users/luqiansong/Desktop/22201381.csv")
# One-hot encode the "category" and "authors" columns
df_encoded = pd.get_dummies(frame, columns=["authors", "date","category"])
# Remove non-numeric columns (headline, link, short_description, and date)
non_numeric_columns = ["headline", "link", "short_description", "date"]
df_encoded = df_encoded.drop(columns=non_numeric_columns, errors='ignore')
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df_encoded)
# Perform PCA
pca = PCA()
components = pca.fit_transform(data_scaled)
components
array([[-8.69705664e-01, 6.26427880e-01, 7.22034372e-03, ...,
3.58492191e-16, 1.88440731e-16, -1.14092553e-18],
[ 1.95225394e+00, 5.29834071e-01, 9.34774259e-03, ...,
1.41763971e-17, -6.35970648e-17, -3.07950819e-20],
[-9.98525823e-01, 7.76308279e-01, 4.16426205e-03, ...,
5.57767221e-18, 3.21935501e-17, -1.34339582e-20],
...,
[ 3.32528900e+00, -7.16890127e-02, 5.15406559e-03, ...,
1.28225445e-17, 2.94217825e-17, -2.80420349e-20],
[-7.73387055e-01, -6.20599006e-01, 8.75271764e-03, ...,
-1.32933787e-17, 3.74208307e-17, 9.37633396e-20],
[-7.79325944e-01, -1.13166481e+00, 7.75873425e-03, ...,
3.98750236e-18, 5.44806642e-18, -5.19434845e-21]])
from pandas import DataFrame as df
# Get the explained variance ratio to understand the contribution of each principal component
explained_variance = pca.explained_variance_ratio_
# Print the explained variance ratio for each principal component
print("Explained Variance Ratio for Principal Components:")
print(explained_variance)
Explained Variance Ratio for Principal Components: [9.64377190e-04 6.13430832e-04 5.98622072e-04 ... 3.11243690e-36 1.95600636e-36 7.42489189e-42]
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_classif
#Set our X and y
X = frame.loc[:,"authors":"date"]
y = frame.loc[:,"category"]
X
| authors | link | short_description | date | |
|---|---|---|---|---|
| 0 | Amber Ferguson | https://www.huffingtonpost.com/entry/new-hamps... | Marco Rubio had a rough night. | 2016-02-07 |
| 1 | Lee Moran | https://www.huffingtonpost.com/entry/bob-costa... | "Bob Costas' hat just got its own Martin Scors... | 2016-12-23 |
| 2 | NaN | https://www.huffingtonpost.com/entry/hillary-c... | NaN | 2015-02-17 |
| 3 | Nick Laure, ContributorAn advocate for logical... | https://www.huffingtonpost.com/entry/an-open-l... | I am not asking anyone to stop supporting Bern... | 2016-03-20 |
| 4 | Mollie Reilly | https://www.huffingtonpost.com/entry/raul-boca... | The state assemblyman announced Monday he'll r... | 2017-11-21 |
| ... | ... | ... | ... | ... |
| 7995 | NaN | https://www.huffingtonpost.comhttp://www.nytim... | Hillary Clinton’s advisers and allies have beg... | 2016-04-23 |
| 7996 | Travis Waldron | https://www.huffingtonpost.com/entry/raiders-n... | Gambling advocates believe the NFL's embrace o... | 2017-03-30 |
| 7997 | Chris Greenberg | https://www.huffingtonpost.com/entry/giants-wo... | NaN | 2014-10-30 |
| 7998 | Chris D'Angelo | https://www.huffingtonpost.com/entry/trump-lif... | The decision was made public by none other tha... | 2017-11-16 |
| 7999 | The Trace, Editorial Partner | https://www.huffingtonpost.com/entry/states-se... | After talking to New Jersey’s compensation off... | 2018-02-12 |
8000 rows × 4 columns
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
# Convert the target variable (y) to numeric format using label encoding
label_encoder = LabelEncoder()
y_numeric = label_encoder.fit_transform(y)
# Convert the feature columns (X) to numeric format using one-hot encoding
one_hot_encoder = OneHotEncoder()
X_numeric = one_hot_encoder.fit_transform(X)
# Calculate mutual information scores
i_scores = mutual_info_classif(X_numeric, y_numeric)
output = list(zip(frame.columns,i_scores))
i_frame = df(output,columns=["Feature","I-gain"])
i_frame
| Feature | I-gain | |
|---|---|---|
| 0 | Unnamed: 0 | 0.000036 |
| 1 | category | 0.000036 |
| 2 | headline | 0.000036 |
| 3 | authors | 0.000173 |
| 4 | link | 0.000036 |
| 5 | short_description | 0.000173 |
| 6 | date | 0.000036 |
i_frame = i_frame.sort_values(by=['I-gain'],ascending=False)
i_frame = i_frame.reset_index()
i_frame
| index | Feature | I-gain | |
|---|---|---|---|
| 0 | 3 | authors | 0.000173 |
| 1 | 5 | short_description | 0.000173 |
| 2 | 0 | Unnamed: 0 | 0.000036 |
| 3 | 1 | category | 0.000036 |
| 4 | 2 | headline | 0.000036 |
| 5 | 4 | link | 0.000036 |
| 6 | 6 | date | 0.000036 |
#i_frame = i_frame.drop("index",axis=1)
i_frame = i_frame.set_index("Feature")
i_frame.plot.bar()
<Axes: xlabel='Feature'>
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
complete_data = pd.read_csv('/Users/luqiansong/Desktop/22201381.csv')
data = complete_data[['short_description','category']]
data
| short_description | category | |
|---|---|---|
| 0 | Marco Rubio had a rough night. | POLITICS |
| 1 | "Bob Costas' hat just got its own Martin Scors... | SPORTS |
| 2 | NaN | POLITICS |
| 3 | I am not asking anyone to stop supporting Bern... | POLITICS |
| 4 | The state assemblyman announced Monday he'll r... | POLITICS |
| ... | ... | ... |
| 7995 | Hillary Clinton’s advisers and allies have beg... | POLITICS |
| 7996 | Gambling advocates believe the NFL's embrace o... | SPORTS |
| 7997 | NaN | SPORTS |
| 7998 | The decision was made public by none other tha... | POLITICS |
| 7999 | After talking to New Jersey’s compensation off... | POLITICS |
8000 rows × 2 columns
data['short_description'].isnull().sum()
850
data['category'].value_counts()
POLITICS 5983 SPORTS 1997 Name: category, dtype: int64
positive_class = data[data['category']=='SPORTS']['short_description']
negative_class = data[data['category']=='POLITICS']['short_description']
negative_class
0 Marco Rubio had a rough night.
2 NaN
3 I am not asking anyone to stop supporting Bern...
4 The state assemblyman announced Monday he'll r...
6 Cross-posted with TomDispatch.com Since 9/11, ...
...
7993 The state had been banned from hosting NCAA ga...
7994 Education policy is not immune to fake news.
7995 Hillary Clinton’s advisers and allies have beg...
7998 The decision was made public by none other tha...
7999 After talking to New Jersey’s compensation off...
Name: short_description, Length: 5983, dtype: object
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
positive_class = positive_class.fillna('')
words_in_positive_class = vectorizer.fit_transform(positive_class)
tokens_and_counts = zip(vectorizer.get_feature_names(), np.asarray(words_in_positive_class.sum(axis=0)).ravel())
/Users/luqiansong/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead. warnings.warn(msg, category=FutureWarning)
df_tokens = pd.DataFrame(tokens_and_counts, columns=['Token', 'Count'])
df_tokens
| Token | Count | |
|---|---|---|
| 0 | 000 | 6 |
| 1 | 002 | 1 |
| 2 | 029 | 1 |
| 3 | 04 | 1 |
| 4 | 09 | 1 |
| ... | ... | ... |
| 5453 | zealand | 1 |
| 5454 | zero | 2 |
| 5455 | zika | 3 |
| 5456 | zone | 1 |
| 5457 | zurich | 1 |
5458 rows × 2 columns
df_tokens.sort_values("Count", ascending=False, inplace=True)
df_tokens.reset_index(inplace=True, drop=True)
df_tokens
| Token | Count | |
|---|---|---|
| 0 | game | 96 |
| 1 | just | 80 |
| 2 | team | 73 |
| 3 | like | 69 |
| 4 | nfl | 65 |
| ... | ... | ... |
| 5453 | gym | 1 |
| 5454 | gymnasts | 1 |
| 5455 | gyms | 1 |
| 5456 | hacking | 1 |
| 5457 | zurich | 1 |
5458 rows × 2 columns
most_popular_tokens = df_tokens.nlargest(columns="Count", n=15)
most_popular_tokens
| Token | Count | |
|---|---|---|
| 0 | game | 96 |
| 1 | just | 80 |
| 2 | team | 73 |
| 3 | like | 69 |
| 4 | nfl | 65 |
| 5 | football | 63 |
| 6 | said | 62 |
| 7 | sports | 61 |
| 8 | time | 60 |
| 9 | new | 55 |
| 10 | year | 54 |
| 11 | win | 48 |
| 12 | players | 48 |
| 13 | league | 45 |
| 14 | season | 44 |
least_popular_tokens = df_tokens.nsmallest(columns="Count", n=15)
least_popular_tokens
| Token | Count | |
|---|---|---|
| 2154 | spanning | 1 |
| 2155 | phalange | 1 |
| 2156 | pickup | 1 |
| 2157 | weapon | 1 |
| 2158 | pita | 1 |
| 2159 | spartans | 1 |
| 2160 | weaknesses | 1 |
| 2161 | pitched | 1 |
| 2162 | wbz | 1 |
| 2163 | philip | 1 |
| 2164 | southpaw | 1 |
| 2165 | wbc | 1 |
| 2166 | warrant | 1 |
| 2167 | phelps | 1 |
| 2168 | sweden | 1 |
fig, axes = plt.subplots(2, 1, figsize=(20,8))
sns.barplot(ax=axes[0], data=least_popular_tokens, x="Token", y ="Count")
sns.barplot(ax=axes[1], data=most_popular_tokens, x="Token", y ="Count")
axes[0].set(ylabel='Counts', xlabel="Tokens", title="%d Least Frequent Tokens After Stop Word Removal" % 20 )
axes[1].set(ylabel='Counts', xlabel="Tokens", title="%d Most Frequent Tokens After Stop Word Removal" % 20 )
plt.tight_layout()
# as shown above, the 20 most frequent words are:spanning, phalange, pickup, weapon, pita, spartans, weaknesses, pithced,wbs, philip, southpaw, wbc, warrant, phelps, sweden
data
| short_description | category | |
|---|---|---|
| 0 | Marco Rubio had a rough night. | POLITICS |
| 1 | "Bob Costas' hat just got its own Martin Scors... | SPORTS |
| 2 | NaN | POLITICS |
| 3 | I am not asking anyone to stop supporting Bern... | POLITICS |
| 4 | The state assemblyman announced Monday he'll r... | POLITICS |
| ... | ... | ... |
| 7995 | Hillary Clinton’s advisers and allies have beg... | POLITICS |
| 7996 | Gambling advocates believe the NFL's embrace o... | SPORTS |
| 7997 | NaN | SPORTS |
| 7998 | The decision was made public by none other tha... | POLITICS |
| 7999 | After talking to New Jersey’s compensation off... | POLITICS |
8000 rows × 2 columns
import pandas as pd
data.dropna(inplace=True)
X = data['short_description']
y = data['category']# Split the dataset
X_train_plus_valid, X_test, y_train_plus_valid, y_test = train_test_split(X, y, random_state=0, test_size = 0.30, train_size = 0.7, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_plus_valid, y_train_plus_valid, random_state=0, test_size = 0.199/0.7, train_size = 0.5/0.7, stratify=y_train_plus_valid)
/var/folders/m_/m3lsq_594494n7k5zm6nmdtc0000gn/T/ipykernel_34631/2359622052.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data.dropna(inplace=True)
type(X_valid)
pandas.core.series.Series
print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)
(3565,) (1419,) (2139,)
# Generating a bag-of-words model
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(X_train)
CountVectorizer(stop_words='english')
X_train_matrix = vectorizer.transform(X_train)
X_valid_matrix = vectorizer.transform(X_valid)
X_test_matrix = vectorizer.transform(X_test)
print(X_train_matrix.shape)
print(X_valid_matrix.shape)
print(X_test_matrix.shape)
(3565, 8958) (1419, 8958) (2139, 8958)
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X_train_matrix, y_train)
KNeighborsClassifier(n_neighbors=1)
# #
X_cm = X_train_matrix
y_true_labels = y_train
model = neigh
# Apply trained model to new dataset
y_pred = model.predict(X_cm)
print(metrics.classification_report(y_true_labels, y_pred))
cm=confusion_matrix(y_true_labels, y_pred)
ax= plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax);
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels');
ax.set_title('Confusion Matrix Training Set');
/Users/luqiansong/opt/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning. mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
precision recall f1-score support
POLITICS 1.00 1.00 1.00 2699
SPORTS 0.99 1.00 0.99 866
accuracy 1.00 3565
macro avg 1.00 1.00 1.00 3565
weighted avg 1.00 1.00 1.00 3565
X_cm = X_valid_matrix
y_true_labels = y_valid
model = neigh
# Apply trained model to new dataset
y_pred = model.predict(X_cm)
print(metrics.classification_report(y_true_labels, y_pred))
cm=confusion_matrix(y_true_labels, y_pred)
ax= plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax);
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels');
ax.set_title('Confusion Matrix Validation Set');
precision recall f1-score support
POLITICS 0.77 0.89 0.82 1075
SPORTS 0.32 0.17 0.22 344
accuracy 0.71 1419
macro avg 0.55 0.53 0.52 1419
weighted avg 0.66 0.71 0.68 1419
/Users/luqiansong/opt/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning. mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
# y_pred is a numpy array. It needs to be converted.
y_pred.shape
(1419,)
print(len(X_valid))
print(len(y_valid))
print(len(y_pred))
1419 1419 1419
y_pred = pd.DataFrame(y_pred, index=X_valid.index, columns=['y_pred'])
valid_X_y = pd.concat([X_valid,y_valid,y_pred], axis=1)
valid_X_y
| short_description | category | y_pred | |
|---|---|---|---|
| 871 | R.I.P. Omer Asik. | SPORTS | POLITICS |
| 7471 | The 2016 election was a stunning blow to the m... | POLITICS | POLITICS |
| 1400 | "They’re gonna start some s**t." | POLITICS | POLITICS |
| 7893 | A day after a gunman opened fire on a practice... | POLITICS | POLITICS |
| 2911 | So far, the ads look nothing like NASCAR. | SPORTS | POLITICS |
| ... | ... | ... | ... |
| 136 | To say he doesn't really care. | SPORTS | POLITICS |
| 65 | A Quinnipiac poll found that 59 percent of wom... | POLITICS | POLITICS |
| 6094 | Holly Rowe of ESPN showed exactly what society... | SPORTS | POLITICS |
| 2874 | American Lindsey Vonn missed the podium with a... | SPORTS | POLITICS |
| 180 | Ben Carson, the retired neurosurgeon who brief... | POLITICS | POLITICS |
1419 rows × 3 columns
# split the dataset with a 70-15-15 ratio for the training, validation, and test sets, respectively
from sklearn.model_selection import train_test_split
# Split the dataset into training, validation, and test sets
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
# Save the datasets as separate CSV files
train_df.to_csv('train.csv', index=False)
valid_df.to_csv('valid.csv', index=False)
test_df.to_csv('test.csv', index=False)
import pandas as pd
# Load train.csv
train_data = pd.read_csv('/Users/luqiansong/Desktop/train.csv')
# Load valid.csv
valid_data = pd.read_csv('/Users/luqiansong/Desktop/valid.csv')
# check train_data info
train_data.head()
| category | headline | authors | link | short_description | date | headline_length | short_description_length | |
|---|---|---|---|---|---|---|---|---|
| 0 | SPORTS | Jack Sock Retires From U.S. Open Match After S... | NaN | https://www.huffingtonpost.com/entry/jack-sock... | American Jack Sock was overcome by the heat an... | 2015-09-03 | 70 | 119.0 |
| 1 | POLITICS | Former Mexican President Vicente Fox Issues St... | Lee Moran | https://www.huffingtonpost.com/entry/vicente-f... | "You better speak up, because this guy is taki... | 2018-04-06 | 85 | 65.0 |
| 2 | POLITICS | Kurds and US vs ISIS | Ryan Campbell, ContributorEditor at DRM Capito... | https://www.huffingtonpost.com/entry/kurds-and... | Allying with Kurdish forces means we can bette... | 2014-11-19 | 20 | 206.0 |
| 3 | POLITICS | Anthony Scaramucci, We Hardly Knew Ye | Marina Fang | https://www.huffingtonpost.com/entry/anthony-s... | 10 highlights from the Mooch's 10 days as Whit... | 2017-07-31 | 37 | 78.0 |
| 4 | SPORTS | Super Bowl Commercials 2014: Watch All Ads Air... | Chris Greenberg | https://www.huffingtonpost.com/entry/super-bow... | CLICK HERE to watch the 50 Greatest Super Bowl... | 2014-02-02 | 90 | 127.0 |
#All LowerCase for train_data
import sys,csv,re
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def converter(x):
try:
return ' '.join([x.lower() for x in str(x).split() if x not in stop_words])
except AttributeError:
return None
train_data['short_description'] = train_data['short_description'].apply(converter)
# check train_data lowercase situation
train_data.head()
| category | headline | authors | link | short_description | date | headline_length | short_description_length | |
|---|---|---|---|---|---|---|---|---|
| 0 | SPORTS | Jack Sock Retires From U.S. Open Match After S... | NaN | https://www.huffingtonpost.com/entry/jack-sock... | american jack sock overcome heat retire fourth... | 2015-09-03 | 70 | 119.0 |
| 1 | POLITICS | Former Mexican President Vicente Fox Issues St... | Lee Moran | https://www.huffingtonpost.com/entry/vicente-f... | "you better speak up, guy taking nowhere." | 2018-04-06 | 85 | 65.0 |
| 2 | POLITICS | Kurds and US vs ISIS | Ryan Campbell, ContributorEditor at DRM Capito... | https://www.huffingtonpost.com/entry/kurds-and... | allying kurdish forces means better fight isis... | 2014-11-19 | 20 | 206.0 |
| 3 | POLITICS | Anthony Scaramucci, We Hardly Knew Ye | Marina Fang | https://www.huffingtonpost.com/entry/anthony-s... | 10 highlights mooch's 10 days white house comm... | 2017-07-31 | 37 | 78.0 |
| 4 | SPORTS | Super Bowl Commercials 2014: Watch All Ads Air... | Chris Greenberg | https://www.huffingtonpost.com/entry/super-bow... | click here watch 50 greatest super bowl commer... | 2014-02-02 | 90 | 127.0 |
#Removing Punctuation in train_data
train_data['description_punc'] =train_data['short_description'].str.replace('[^\w\s]','')
#Removal of stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
train_data['description_stop'] = train_data['description_punc'] .apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train_data.head()
[nltk_data] Downloading package stopwords to [nltk_data] /Users/luqiansong/nltk_data... [nltk_data] Package stopwords is already up-to-date!
| category | headline | authors | link | short_description | date | headline_length | short_description_length | description_punc | description_stop | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SPORTS | Jack Sock Retires From U.S. Open Match After S... | NaN | https://www.huffingtonpost.com/entry/jack-sock... | american jack sock overcome heat retire fourth... | 2015-09-03 | 70 | 119.0 | american jack sock overcome heat retire fourth... | american jack sock overcome heat retire fourth... |
| 1 | POLITICS | Former Mexican President Vicente Fox Issues St... | Lee Moran | https://www.huffingtonpost.com/entry/vicente-f... | "you better speak up, guy taking nowhere." | 2018-04-06 | 85 | 65.0 | "you better speak up, guy taking nowhere." | "you better speak up, guy taking nowhere." |
| 2 | POLITICS | Kurds and US vs ISIS | Ryan Campbell, ContributorEditor at DRM Capito... | https://www.huffingtonpost.com/entry/kurds-and... | allying kurdish forces means better fight isis... | 2014-11-19 | 20 | 206.0 | allying kurdish forces means better fight isis... | allying kurdish forces means better fight isis... |
| 3 | POLITICS | Anthony Scaramucci, We Hardly Knew Ye | Marina Fang | https://www.huffingtonpost.com/entry/anthony-s... | 10 highlights mooch's 10 days white house comm... | 2017-07-31 | 37 | 78.0 | 10 highlights mooch's 10 days white house comm... | 10 highlights mooch's 10 days white house comm... |
| 4 | SPORTS | Super Bowl Commercials 2014: Watch All Ads Air... | Chris Greenberg | https://www.huffingtonpost.com/entry/super-bow... | click here watch 50 greatest super bowl commer... | 2014-02-02 | 90 | 127.0 | click here watch 50 greatest super bowl commer... | click watch 50 greatest super bowl commercials... |
#Tokenization
import textblob
from textblob import TextBlob
def tokenization(short_description):
short_description = re.split('\W+', short_description)
return short_description
train_data['description_tokenized'] = train_data['description_stop'].apply(lambda x: tokenization(x.lower()))
#Lemmatization is a more effective option than stemming because it converts the word into its root word,
#rather than just stripping the suffices.
#nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()
def lemmatizer(short_description):
short_description = [wn.lemmatize(word) for word in short_description]
return short_description
train_data['description_lemmatized'] = train_data['description_tokenized'].apply(lambda x: lemmatizer(x))
# check all the transformation to the train data
train_data[['short_description', 'description_punc', 'description_tokenized','description_stop','description_lemmatized']][0:9]
| short_description | description_punc | description_tokenized | description_stop | description_lemmatized | |
|---|---|---|---|---|---|
| 0 | american jack sock overcome heat retire fourth... | american jack sock overcome heat retire fourth... | [american, jack, sock, overcome, heat, retire,... | american jack sock overcome heat retire fourth... | [american, jack, sock, overcome, heat, retire,... |
| 1 | "you better speak up, guy taking nowhere." | "you better speak up, guy taking nowhere." | [, you, better, speak, up, guy, taking, nowher... | "you better speak up, guy taking nowhere." | [, you, better, speak, up, guy, taking, nowher... |
| 2 | allying kurdish forces means better fight isis... | allying kurdish forces means better fight isis... | [allying, kurdish, forces, means, better, figh... | allying kurdish forces means better fight isis... | [allying, kurdish, force, mean, better, fight,... |
| 3 | 10 highlights mooch's 10 days white house comm... | 10 highlights mooch's 10 days white house comm... | [10, highlights, mooch, s, 10, days, white, ho... | 10 highlights mooch's 10 days white house comm... | [10, highlight, mooch, s, 10, day, white, hous... |
| 4 | click here watch 50 greatest super bowl commer... | click here watch 50 greatest super bowl commer... | [click, watch, 50, greatest, super, bowl, comm... | click watch 50 greatest super bowl commercials... | [click, watch, 50, greatest, super, bowl, comm... |
| 5 | here's hoping dirt winter. | here's hoping dirt winter. | [here, s, hoping, dirt, winter, ] | here's hoping dirt winter. | [here, s, hoping, dirt, winter, ] |
| 6 | a visit trump tower reveals donald trump famil... | a visit trump tower reveals donald trump famil... | [visit, trump, tower, reveals, donald, trump, ... | visit trump tower reveals donald trump family ... | [visit, trump, tower, reveals, donald, trump, ... |
| 7 | so might time rethink administration's refugee... | so might time rethink administration's refugee... | [might, time, rethink, administration, s, refu... | might time rethink administration's refugee po... | [might, time, rethink, administration, s, refu... |
| 8 | hillary clinton's testimony mostly confirmed p... | hillary clinton's testimony mostly confirmed p... | [hillary, clinton, s, testimony, mostly, confi... | hillary clinton's testimony mostly confirmed p... | [hillary, clinton, s, testimony, mostly, confi... |
train_data.drop(columns=['description_punc', 'description_tokenized', 'description_stop','headline_length','short_description_length'])
| category | headline | authors | link | short_description | date | description_lemmatized | |
|---|---|---|---|---|---|---|---|
| 0 | SPORTS | Jack Sock Retires From U.S. Open Match After S... | NaN | https://www.huffingtonpost.com/entry/jack-sock... | american jack sock overcome heat retire fourth... | 2015-09-03 | [american, jack, sock, overcome, heat, retire,... |
| 1 | POLITICS | Former Mexican President Vicente Fox Issues St... | Lee Moran | https://www.huffingtonpost.com/entry/vicente-f... | "you better speak up, guy taking nowhere." | 2018-04-06 | [, you, better, speak, up, guy, taking, nowher... |
| 2 | POLITICS | Kurds and US vs ISIS | Ryan Campbell, ContributorEditor at DRM Capito... | https://www.huffingtonpost.com/entry/kurds-and... | allying kurdish forces means better fight isis... | 2014-11-19 | [allying, kurdish, force, mean, better, fight,... |
| 3 | POLITICS | Anthony Scaramucci, We Hardly Knew Ye | Marina Fang | https://www.huffingtonpost.com/entry/anthony-s... | 10 highlights mooch's 10 days white house comm... | 2017-07-31 | [10, highlight, mooch, s, 10, day, white, hous... |
| 4 | SPORTS | Super Bowl Commercials 2014: Watch All Ads Air... | Chris Greenberg | https://www.huffingtonpost.com/entry/super-bow... | click here watch 50 greatest super bowl commer... | 2014-02-02 | [click, watch, 50, greatest, super, bowl, comm... |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 5595 | POLITICS | U.S. Military Cancels Hearing For 9/11 Suspects | NaN | https://www.huffingtonpost.com/entry/military-... | another setback government efforts try five me... | 2015-08-16 | [another, setback, government, effort, try, fi... |
| 5596 | POLITICS | Repealing Obamacare Is A Trap For The GOP, Chu... | Michael McAuliff | https://www.huffingtonpost.com/entry/chuck-sch... | "they regret day it.” | 2016-11-22 | [, they, regret, day, it, ] |
| 5597 | SPORTS | Top-Tier Gymnast Maggie Nichols Says Larry Nas... | Alanna Vagianos | https://www.huffingtonpost.com/entry/gymnast-m... | nichols wrote statement first alert usa gymnas... | 2018-01-09 | [nichols, wrote, statement, first, alert, usa,... |
| 5598 | POLITICS | Why Scott Walker Will Never Be President | NaN | https://www.huffingtonpost.com/entry/scott-wal... | nan | 2014-06-21 | [nan] |
| 5599 | POLITICS | Paul Ryan Just Got The Sweetest Deal In Congress | Zach Carter | https://www.huffingtonpost.com/entry/paul-ryan... | and democrats thrilled. | 2015-10-23 | [democrat, thrilled, ] |
5600 rows × 7 columns
valid_data = pd.read_csv('/Users/luqiansong/Desktop/valid.csv')
# check text_data info
valid_data.head()
| category | headline | authors | link | short_description | date | headline_length | short_description_length | |
|---|---|---|---|---|---|---|---|---|
| 0 | POLITICS | HuffPost Rise: What You Need To Know On April 21 | NaN | https://www.huffingtonpost.com/entry/huffpost-... | Welcome to the HuffPost Rise Morning Newsbrief... | 2016-04-21 | 48 | 103.0 |
| 1 | POLITICS | How The Thomas Fire Could Affect An Already St... | Antonia Blumberg | https://www.huffingtonpost.com/entry/southern-... | "I don’t think Ventura County is well-position... | 2017-12-12 | 73 | 83.0 |
| 2 | SPORTS | Michael Phelps To U.S. Olympic Committee: Do S... | Jenna Amatulli | https://www.huffingtonpost.com/entry/michael-p... | Most athletes experience post-Olympics depress... | 2018-04-02 | 83 | 84.0 |
| 3 | POLITICS | HUFFPOST HILL - Better Call Lanny | Eliot Nelson | https://www.huffingtonpost.com/entry/huffpost-... | NaN | 2014-09-17 | 33 | NaN |
| 4 | POLITICS | Trump Booed At Davos For Criticizing 'Fake' Media | Marina Fang | https://www.huffingtonpost.com/entry/trump-dav... | The president dismissed reports that he ordere... | 2018-01-26 | 49 | 108.0 |
train_data.to_csv('train_Cleaned.csv')
# doing transformation on valid data
# Load valid.csv
valid_data = pd.read_csv('/Users/luqiansong/Desktop/valid.csv')
#All LowerCase for train_data
import sys,csv,re
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def converter(x):
try:
return ' '.join([x.lower() for x in str(x).split() if x not in stop_words])
except AttributeError:
return None
valid_data['short_description'] = valid_data['short_description'].apply(converter)
#Removing Punctuation in train_data
valid_data['description_punc'] =valid_data['short_description'].str.replace('[^\w\s]','')
#Removal of stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
valid_data['description_stop'] = valid_data['description_punc'] .apply(lambda x: " ".join(x for x in x.split() if x not in stop))
[nltk_data] Downloading package stopwords to [nltk_data] /Users/luqiansong/nltk_data... [nltk_data] Package stopwords is already up-to-date!
#Tokenization
import textblob
from textblob import TextBlob
def tokenization(short_description):
short_description = re.split('\W+', short_description)
return short_description
valid_data['description_tokenized'] = valid_data['description_stop'].apply(lambda x: tokenization(x.lower()))
#Lemmatization
#nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()
def lemmatizer(short_description):
short_description = [wn.lemmatize(word) for word in short_description]
return short_description
valid_data['description_lemmatized'] = valid_data['description_tokenized'].apply(lambda x: lemmatizer(x))
# check all the transformation to the valid data
valid_data[['short_description', 'description_punc', 'description_tokenized','description_stop','description_lemmatized']][0:9]
| short_description | description_punc | description_tokenized | description_stop | description_lemmatized | |
|---|---|---|---|---|---|
| 0 | welcome huffpost rise morning newsbrief, short... | welcome huffpost rise morning newsbrief, short... | [welcome, huffpost, rise, morning, newsbrief, ... | welcome huffpost rise morning newsbrief, short... | [welcome, huffpost, rise, morning, newsbrief, ... |
| 1 | "i don’t think ventura county well-positioned ... | "i don’t think ventura county well-positioned ... | [, i, don, t, think, ventura, county, well, po... | "i don’t think ventura county well-positioned ... | [, i, don, t, think, ventura, county, well, po... |
| 2 | most athletes experience post-olympics depress... | most athletes experience post-olympics depress... | [athletes, experience, post, olympics, depress... | athletes experience post-olympics depression, ... | [athlete, experience, post, olympics, depressi... |
| 3 | nan | nan | [nan] | nan | [nan] |
| 4 | the president dismissed reports ordered firing... | the president dismissed reports ordered firing... | [president, dismissed, reports, ordered, firin... | president dismissed reports ordered firing spe... | [president, dismissed, report, ordered, firing... |
| 5 | a lower court found ban disproportionately aff... | a lower court found ban disproportionately aff... | [lower, court, found, ban, disproportionately,... | lower court found ban disproportionately affec... | [lower, court, found, ban, disproportionately,... |
| 6 | the ruling paves way states legalize sports be... | the ruling paves way states legalize sports be... | [ruling, paves, way, states, legalize, sports,... | ruling paves way states legalize sports bettin... | [ruling, pave, way, state, legalize, sport, be... |
| 7 | hillary clinton recently dismissed idea gettin... | hillary clinton recently dismissed idea gettin... | [hillary, clinton, recently, dismissed, idea, ... | hillary clinton recently dismissed idea gettin... | [hillary, clinton, recently, dismissed, idea, ... |
| 8 | like read below? sign huffpost hill get cheeky... | like read below? sign huffpost hill get cheeky... | [like, read, below, sign, huffpost, hill, get,... | like read below? sign huffpost hill get cheeky... | [like, read, below, sign, huffpost, hill, get,... |
valid_data.drop(columns=['headline_length', 'short_description_length', 'description_stop', 'description_tokenized','description_punc'])
| category | headline | authors | link | short_description | date | description_lemmatized | |
|---|---|---|---|---|---|---|---|
| 0 | POLITICS | HuffPost Rise: What You Need To Know On April 21 | NaN | https://www.huffingtonpost.com/entry/huffpost-... | welcome huffpost rise morning newsbrief, short... | 2016-04-21 | [welcome, huffpost, rise, morning, newsbrief, ... |
| 1 | POLITICS | How The Thomas Fire Could Affect An Already St... | Antonia Blumberg | https://www.huffingtonpost.com/entry/southern-... | "i don’t think ventura county well-positioned ... | 2017-12-12 | [, i, don, t, think, ventura, county, well, po... |
| 2 | SPORTS | Michael Phelps To U.S. Olympic Committee: Do S... | Jenna Amatulli | https://www.huffingtonpost.com/entry/michael-p... | most athletes experience post-olympics depress... | 2018-04-02 | [athlete, experience, post, olympics, depressi... |
| 3 | POLITICS | HUFFPOST HILL - Better Call Lanny | Eliot Nelson | https://www.huffingtonpost.com/entry/huffpost-... | nan | 2014-09-17 | [nan] |
| 4 | POLITICS | Trump Booed At Davos For Criticizing 'Fake' Media | Marina Fang | https://www.huffingtonpost.com/entry/trump-dav... | the president dismissed reports ordered firing... | 2018-01-26 | [president, dismissed, report, ordered, firing... |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1195 | SPORTS | NCAA Tournament Teams, Seeds 2014: March Madne... | NaN | https://www.huffingtonpost.com/entry/ncaa-tour... | shortly conference tournaments wrapped selecti... | 2014-03-16 | [shortly, conference, tournament, wrapped, sel... |
| 1196 | POLITICS | Obama Condemns 'Cynical' GOP Race Baiting In V... | Mollie Reilly | https://www.huffingtonpost.com/entry/obama-ral... | "i don’t think anybody really thinks somebody ... | 2017-10-20 | [, i, don, t, think, anybody, really, think, s... |
| 1197 | SPORTS | Happy Kid Dancing To 'Happy' At A Basketball G... | Lucy McCalmont | https://www.huffingtonpost.com/entry/kid-danci... | kid dances basketball game, amazing life. | 2015-03-27 | [kid, dance, basketball, game, amazing, life, ] |
| 1198 | POLITICS | Donald Trump's Labor Pick Would Be Expected To... | Dave Jamieson | https://www.huffingtonpost.com/entry/trumps-la... | hardee's agreed pay workers nearly $60,000 run... | 2016-12-09 | [hardee, s, agreed, pay, worker, nearly, 60, 0... |
| 1199 | POLITICS | HUFFPOLLSTER: Young Americans Heavily Favor Tr... | Natalie Jackson, Ariel Edwards-Levy, and Janie... | https://www.huffingtonpost.com/entry/transgend... | the public whole split, 18-29-year-olds much p... | 2016-04-21 | [public, whole, split, 18, 29, year, old, much... |
1200 rows × 7 columns
valid_data.to_csv('valid_Cleaned.csv')
# I stored train_cleaned and valid_cleaned file after transforamtion only keep description_lemmatized
# Load train.csv
train_cleandata = pd.read_csv('/Users/luqiansong/Desktop/train_Cleaned.csv')
# Load valid.csv
valid_cleandata = pd.read_csv('/Users/luqiansong/Desktop/valid_Cleaned.csv')
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
# classifiers 1: logistic Regression:
# Select the 'short_description' columns as text features
text_features = ['description_lemmatized']
text_data = train_cleandata[text_features]
# Combine the text data into a single column
text_data['combined_text'] = text_data[text_features].apply(lambda x: ' '.join(x), axis=1)
# Create an instance of the TF-IDF vectorizer
vectorizer = TfidfVectorizer()
# Fit the vectorizer to the combined text data
X_train_text = vectorizer.fit_transform(text_data['combined_text'])
# Convert the TF-IDF matrix to an array
X_train_text = X_train_text.toarray()
# Select the numerical features from the original dataset
numeric_features = [ 'short_description_length']
X_train_numeric = train_cleandata[numeric_features].values
# Concatenate the text and numerical features
X_train = np.concatenate((X_train_text, X_train_numeric), axis=1)
# Select the 'category' column as the target variable
y_train = train_cleandata['category']
/var/folders/m_/m3lsq_594494n7k5zm6nmdtc0000gn/T/ipykernel_69502/1467203526.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy text_data['combined_text'] = text_data[text_features].apply(lambda x: ' '.join(x), axis=1)
test=pd.read_csv('/Users/luqiansong/Desktop/test.csv')
y_test=test['category']
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Create and train the logistic regression model
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)
# Predict the target variable on the training data
y_train_pred = logistic_regression_model.predict(X_train)
# Calculate the training accuracy
training_accuracy = accuracy_score(y_train, y_train_pred)
# Display the training accuracy
print("Training Accuracy:", training_accuracy)
Training Accuracy: 0.8750749550269838
/Users/luqiansong/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
# Import the necessary library
from sklearn.metrics import confusion_matrix
# Predict the target variable on the training data
y_train_pred = logistic_regression_model.predict(X_train)
# Create the confusion matrix
conf_matrix = confusion_matrix(y_train, y_train_pred)
# Display the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)
Confusion Matrix: [[3812 6] [ 619 566]]
y_train_pred = logistic_regression_model.predict(X_train)
# Create the confusion matrix
conf_matrix = confusion_matrix(y_train, y_train_pred)
# Display the confusion matrix with a heatmap
print("Confusion Matrix:")
print(conf_matrix)
# Display the heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="YlGnBu")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix Heatmap")
plt.show()
Confusion Matrix: [[3812 6] [ 619 566]]
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
# Handling missing values by replacing them with zeros
train_cleandata.fillna(0, inplace=True)
# Select the 'description_lemmatized' columns as text features
text_features = ['description_lemmatized']
text_data = train_cleandata[text_features]
# Combine the text data into a single column
text_data['combined_text'] = text_data[text_features].apply(lambda x: ' '.join(x), axis=1)
# Create an instance of the TF-IDF vectorizer
vectorizer = TfidfVectorizer()
# Fit the vectorizer to the combined text data
X_train_text = vectorizer.fit_transform(text_data['combined_text'])
# Convert the TF-IDF matrix to an array
X_train_text = X_train_text.toarray()
# Select the numerical features from the original dataset
numeric_features = ['short_description_length']
X_train_numeric = train_cleandata[numeric_features].values
# Concatenate the text and numerical features
X_train = np.concatenate((X_train_text, X_train_numeric), axis=1)
# Convert the target variable to categorical
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['category'])
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
# Predict on the training data
y_train_pred = rf_classifier.predict(X_train)
# Calculate accuracy on the training data
train_accuracy = accuracy_score(y_train, y_train_pred)
# Display the training accuracy
train_accuracy
/var/folders/m_/m3lsq_594494n7k5zm6nmdtc0000gn/T/ipykernel_10082/2101469619.py:15: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy text_data['combined_text'] = text_data[text_features].apply(lambda x: ' '.join(x), axis=1)
0.9651785714285714
import os
import os
import numpy as np
import cv2
import random
import sklearn
import keras
import PIL
import matplotlib.pyplot as plt
import seaborn as sns
from keras import backend as K
from keras.layers import AveragePooling2D
from tensorflow.keras.optimizers import RMSprop
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv2D, MaxPooling2D, Flatten
from keras.callbacks import ModelCheckpoint
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import tensorflow as tf
from keras.layers import Embedding
embedding_dim = 50
vocab_size=12482
max_length=150
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()
Model: "sequential_5"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_3 (Embedding) (None, 150, 50) 624100
flatten_3 (Flatten) (None, 7500) 0
dense_10 (Dense) (None, 10) 75010
dense_11 (Dense) (None, 1) 11
=================================================================
Total params: 699,121
Trainable params: 699,121
Non-trainable params: 0
_________________________________________________________________
model.compile(optimizer=RMSprop(learning_rate=0.0001, decay=1e-6),
loss='binary_crossentropy',
metrics=['accuracy'])
import pandas as pd
# Load train.csv
train_data = pd.read_csv('/Users/luqiansong/Desktop/train_Cleaned.csv')
# Load valid.csv
valid_data = pd.read_csv('/Users/luqiansong/Desktop/valid_Cleaned.csv')
X_train=train_data['description_lemmatized']
y_train_wide=train_data['category']
X_valid=valid_data['description_lemmatized']
y_valid_wide=valid_data['category']
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_valid_seq = tokenizer.texts_to_sequences(X_valid)
# Pad sequences to a fixed length
max_length = 150
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_valid_pad = pad_sequences(X_valid_seq, maxlen=max_length)
# Rest
history = model.fit(X_train_pad, y_train, \
epochs=20, batch_size=32, verbose=1, \
validation_data=(X_valid_pad, y_valid))
Epoch 1/20 175/175 [==============================] - 1s 3ms/step - loss: 0.5791 - accuracy: 0.7487 - val_loss: 0.5874 - val_accuracy: 0.7333 Epoch 2/20 175/175 [==============================] - 0s 3ms/step - loss: 0.5602 - accuracy: 0.7523 - val_loss: 0.5813 - val_accuracy: 0.7333 Epoch 3/20 175/175 [==============================] - 0s 3ms/step - loss: 0.5547 - accuracy: 0.7523 - val_loss: 0.5749 - val_accuracy: 0.7333 Epoch 4/20 175/175 [==============================] - 0s 3ms/step - loss: 0.5495 - accuracy: 0.7523 - val_loss: 0.5713 - val_accuracy: 0.7333 Epoch 5/20 175/175 [==============================] - 0s 3ms/step - loss: 0.5427 - accuracy: 0.7523 - val_loss: 0.5650 - val_accuracy: 0.7333 Epoch 6/20 175/175 [==============================] - 0s 3ms/step - loss: 0.5338 - accuracy: 0.7523 - val_loss: 0.5586 - val_accuracy: 0.7333 Epoch 7/20 175/175 [==============================] - 0s 3ms/step - loss: 0.5235 - accuracy: 0.7523 - val_loss: 0.5520 - val_accuracy: 0.7333 Epoch 8/20 175/175 [==============================] - 1s 3ms/step - loss: 0.5114 - accuracy: 0.7523 - val_loss: 0.5419 - val_accuracy: 0.7333 Epoch 9/20 175/175 [==============================] - 1s 3ms/step - loss: 0.4963 - accuracy: 0.7523 - val_loss: 0.5274 - val_accuracy: 0.7333 Epoch 10/20 175/175 [==============================] - 1s 3ms/step - loss: 0.4786 - accuracy: 0.7523 - val_loss: 0.5119 - val_accuracy: 0.7333 Epoch 11/20 175/175 [==============================] - 0s 2ms/step - loss: 0.4578 - accuracy: 0.7529 - val_loss: 0.4958 - val_accuracy: 0.7333 Epoch 12/20 175/175 [==============================] - 0s 3ms/step - loss: 0.4344 - accuracy: 0.7561 - val_loss: 0.4816 - val_accuracy: 0.7358 Epoch 13/20 175/175 [==============================] - 1s 3ms/step - loss: 0.4106 - accuracy: 0.7754 - val_loss: 0.4581 - val_accuracy: 0.7550 Epoch 14/20 175/175 [==============================] - 1s 3ms/step - loss: 0.3866 - accuracy: 0.8007 - val_loss: 0.4468 - val_accuracy: 0.7567 Epoch 15/20 175/175 [==============================] - 1s 3ms/step - loss: 0.3654 - accuracy: 0.8229 - val_loss: 0.4305 - val_accuracy: 0.7700 Epoch 16/20 175/175 [==============================] - 1s 3ms/step - loss: 0.3451 - accuracy: 0.8473 - val_loss: 0.4143 - val_accuracy: 0.8000 Epoch 17/20 175/175 [==============================] - 0s 3ms/step - loss: 0.3261 - accuracy: 0.8652 - val_loss: 0.4055 - val_accuracy: 0.8008 Epoch 18/20 175/175 [==============================] - 0s 3ms/step - loss: 0.3089 - accuracy: 0.8784 - val_loss: 0.3950 - val_accuracy: 0.8075 Epoch 19/20 175/175 [==============================] - 0s 3ms/step - loss: 0.2943 - accuracy: 0.8873 - val_loss: 0.3864 - val_accuracy: 0.8167 Epoch 20/20 175/175 [==============================] - 0s 3ms/step - loss: 0.2811 - accuracy: 0.8929 - val_loss: 0.3799 - val_accuracy: 0.8242
# History saves the training into a dictionary structure with the keys below
history.history.keys()
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='lower right')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
# plt.ylim((-0.1, 1.1))
plt.show()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(X_train)
CountVectorizer(stop_words='english')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
CountVectorizer(stop_words='english')
X_train_matrix = vectorizer.transform(X_train)
X_valid_matrix = vectorizer.transform(X_valid)
# Handling missing values by replacing them with zeros
train_cleandata.fillna(0, inplace=True)
# Select the 'description_lemmatized' columns as text features
text_features = ['description_lemmatized']
text_data = train_cleandata[text_features]
# Combine the text data into a single column
text_data['combined_text'] = text_data[text_features].apply(lambda x: ' '.join(x), axis=1)
# Create an instance of the TF-IDF vectorizer
vectorizer = TfidfVectorizer()
# Fit the vectorizer to the combined text data
X_train_text = vectorizer.fit_transform(text_data['combined_text'])
# Convert the TF-IDF matrix to an array
X_train_text = X_train_text.toarray()
# Select the numerical features from the original dataset
numeric_features = ['short_description_length']
X_train_numeric = train_cleandata[numeric_features].values
# Concatenate the text and numerical features
X_train = np.concatenate((X_train_text, X_train_numeric), axis=1)
# Convert the target variable to categorical
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['category'])
/var/folders/m_/m3lsq_594494n7k5zm6nmdtc0000gn/T/ipykernel_69502/3337987493.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy text_data['combined_text'] = text_data[text_features].apply(lambda x: ' '.join(x), axis=1)
KNeighborsClassifier(n_neighbors=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier(n_neighbors=1)
sequential = KNeighborsClassifier(n_neighbors=1)
sequential.fit(X_train_matrix, y_train)
KNeighborsClassifier(n_neighbors=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier(n_neighbors=1)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Change X and y to the dataset
X_cm = X_train_matrix
y_true_labels = y_train
# Convert the target variable to categorical
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['category'])
# Create and fit the KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors=1)
knn_classifier.fit(X_train, y_train)
# Apply trained model to new dataset
y_pred = knn_classifier.predict(X_train)
# Print classification report
print(classification_report(y_train, y_pred))
# Create and plot the confusion matrix
cm = confusion_matrix(y_train, y_pred)
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix Training Set')
plt.show()
precision recall f1-score support
0 0.96 1.00 0.98 4213
1 1.00 0.86 0.92 1374
2 1.00 1.00 1.00 13
accuracy 0.97 5600
macro avg 0.98 0.95 0.97 5600
weighted avg 0.97 0.97 0.96 5600
# y_pred is a numpy array. It needs to be converted.
y_pred.shape
(5600,)
y_valid=valid_data['category']
print(len(X_valid))
print(len(y_valid))
print(len(y_pred))
1200 1200 5600
y_pred_valid = y_pred[:len(X_valid)]
y_pred_df = pd.DataFrame(y_pred_valid, index=X_valid.index, columns=['y_pred'])
valid_X_y = pd.concat([X_valid,y_valid,y_pred_df], axis=1)
valid_X_y
| description_lemmatized | category | y_pred | |
|---|---|---|---|
| 0 | ['welcome', 'huffpost', 'rise', 'morning', 'ne... | POLITICS | 1 |
| 1 | ['', 'i', 'don', 't', 'think', 'ventura', 'cou... | POLITICS | 0 |
| 2 | ['athlete', 'experience', 'post', 'olympics', ... | SPORTS | 0 |
| 3 | ['nan'] | POLITICS | 0 |
| 4 | ['president', 'dismissed', 'report', 'ordered'... | POLITICS | 1 |
| ... | ... | ... | ... |
| 1195 | ['shortly', 'conference', 'tournament', 'wrapp... | SPORTS | 1 |
| 1196 | ['', 'i', 'don', 't', 'think', 'anybody', 'rea... | POLITICS | 0 |
| 1197 | ['kid', 'dance', 'basketball', 'game', 'amazin... | SPORTS | 0 |
| 1198 | ['hardee', 's', 'agreed', 'pay', 'worker', 'ne... | POLITICS | 0 |
| 1199 | ['public', 'whole', 'split', '18', '29', 'year... | POLITICS | 0 |
1200 rows × 3 columns
# mistakes_valid for model = Sequential()
193+2
195
# mistakes_valid for logistic_regression_model
619+6
625
Got different error analysis for different model,found that the senqutial model is high accuracy
complete_data = pd.read_csv('/Users/luqiansong/Desktop/22201381.csv')
data = complete_data[['headline','category']]
data
| headline | category | |
|---|---|---|
| 0 | Watch Highlights From The Last GOP Debate Befo... | POLITICS |
| 1 | Bob Costas And His Fedora Are 'Thursday Night ... | SPORTS |
| 2 | Hillary Clinton Met Privately With Elizabeth W... | POLITICS |
| 3 | An Open Letter to My Fellow Millennials on Hil... | POLITICS |
| 4 | Key California Lawmaker Steps Down Amid Harass... | POLITICS |
| ... | ... | ... |
| 7995 | Hillary Clinton’s Campaign, Cautious But Confi... | POLITICS |
| 7996 | What The Raiders’ Move To Vegas Means For The ... | SPORTS |
| 7997 | Giants Are World Series Champions! | SPORTS |
| 7998 | Trump To Lift Ban On Import Of Elephant Trophi... | POLITICS |
| 7999 | States Set Aside Millions Of Dollars For Crime... | POLITICS |
8000 rows × 2 columns
import pandas as pd
data.dropna(inplace=True)
X = data['headline']
y = data['category']
X_train_plus_valid, X_test, y_train_plus_valid, y_test = train_test_split(X, y, random_state=0, test_size = 0.30, train_size = 0.7, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_plus_valid, y_train_plus_valid, random_state=0, test_size = 0.199/0.7, train_size = 0.5/0.7, stratify=y_train_plus_valid)
/Users/luqiansong/opt/anaconda3/lib/python3.9/site-packages/pandas/util/_decorators.py:311: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return func(*args, **kwargs)
type(X_valid)
pandas.core.series.Series
print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)
(3982,) (1586,) (2391,)
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(X_train)
CountVectorizer(stop_words='english')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
CountVectorizer(stop_words='english')
X_train_matrix = vectorizer.transform(X_train)
X_valid_matrix = vectorizer.transform(X_valid)
X_test_matrix = vectorizer.transform(X_test)
print(X_train_matrix.shape)
print(X_valid_matrix.shape)
print(X_test_matrix.shape)
(3982, 7596) (1586, 7596) (2391, 7596)
df=data
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X_train_matrix, y_train)
KNeighborsClassifier(n_neighbors=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier(n_neighbors=1)
# # Change X and y to the dataset
X_cm = X_train_matrix
y_true_labels = y_train
model = neigh
# Apply trained model to new dataset
y_pred = model.predict(X_cm)
print(metrics.classification_report(y_true_labels, y_pred))
cm=confusion_matrix(y_true_labels, y_pred)
ax= plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax);
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels');
ax.set_title('Confusion Matrix Training Set');
precision recall f1-score support
POLITICS 1.00 1.00 1.00 2985
SPORTS 1.00 1.00 1.00 997
accuracy 1.00 3982
macro avg 1.00 1.00 1.00 3982
weighted avg 1.00 1.00 1.00 3982
# Extract features (X) and labels (y)
X = data['headline']
y = data['category']
X
0 Watch Highlights From The Last GOP Debate Befo...
1 Bob Costas And His Fedora Are 'Thursday Night ...
2 Hillary Clinton Met Privately With Elizabeth W...
3 An Open Letter to My Fellow Millennials on Hil...
4 Key California Lawmaker Steps Down Amid Harass...
...
7995 Hillary Clinton’s Campaign, Cautious But Confi...
7996 What The Raiders’ Move To Vegas Means For The ...
7997 Giants Are World Series Champions!
7998 Trump To Lift Ban On Import Of Elephant Trophi...
7999 States Set Aside Millions Of Dollars For Crime...
Name: headline, Length: 7967, dtype: object
from sklearn.feature_extraction.text import TfidfVectorizer
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
# Fit and transform the text data
X_vectorized = vectorizer.fit_transform(X)
from sklearn.preprocessing import LabelEncoder
# Create a label encoder
label_encoder = LabelEncoder()
# Fit and transform the labels
y_encoded = label_encoder.fit_transform(y)
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_encoded, test_size=0.2, random_state=42)
# Create and train the Logistic Regression model
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Make predictions on the test set
y_pred = logreg_model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))
# Print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
Accuracy: 0.9109159347553325
Classification Report:
precision recall f1-score support
0 0.91 0.99 0.94 1223
1 0.94 0.66 0.78 371
accuracy 0.91 1594
macro avg 0.92 0.82 0.86 1594
weighted avg 0.91 0.91 0.91 1594
Confusion Matrix:
[[1206 17]
[ 125 246]]
# Create the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
# Create a DataFrame for the confusion matrix
conf_matrix_df = pd.DataFrame(conf_matrix, index=label_encoder.classes_, columns=label_encoder.classes_)
# Create the heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_df, annot=True, cmap='Blues', fmt='d', cbar=False)
plt.xlabel('Predicted Category')
plt.ylabel('True Category')
plt.title('Confusion Matrix')
plt.show()
from sklearn.feature_extraction.text import TfidfVectorizer
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
# Fit and transform the text data
X_vectorized = vectorizer.fit_transform(X)
from sklearn.preprocessing import LabelEncoder
# Create a label encoder
label_encoder = LabelEncoder()
# Fit and transform the labels
y_encoded = label_encoder.fit_transform(y)
from sklearn.ensemble import RandomForestClassifie
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_encoded, test_size=0.2, random_state=42)
# Create and train the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
RandomForestClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=42)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Make predictions on the test set
y_pred = rf_model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))
# Print the confusion matrix using heatmap
import seaborn as sns
import matplotlib.pyplot as plt
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix_df = pd.DataFrame(conf_matrix, index=label_encoder.classes_, columns=label_encoder.classes_)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_df, annot=True, cmap='Blues', fmt='d', cbar=False)
plt.xlabel('Predicted Category')
plt.ylabel('True Category')
plt.title('Confusion Matrix')
plt.show()
Accuracy: 0.9146800501882058
Classification Report:
precision recall f1-score support
0 0.92 0.97 0.95 1223
1 0.88 0.73 0.80 371
accuracy 0.91 1594
macro avg 0.90 0.85 0.87 1594
weighted avg 0.91 0.91 0.91 1594
from sklearn.feature_extraction.text import TfidfVectorizer
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
# Fit and transform the text data
X_vectorized = vectorizer.fit_transform(X)
from sklearn.preprocessing import LabelEncoder
# Create a label encoder
label_encoder = LabelEncoder()
# Fit and transform the labels
y_encoded = label_encoder.fit_transform(y)
from tensorflow import keras
from tensorflow.keras import layers
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_encoded, test_size=0.2, random_state=42)
# Create the sequential model
model = keras.Sequential([
layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
layers.Dropout(0.5),
layers.Dense(32, activation='relu'),
layers.Dropout(0.5),
layers.Dense(1, activation='sigmoid')
])
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train the model
history = model.fit(X_train.toarray(), y_train, epochs=10, batch_size=32, validation_split=0.2)
Epoch 1/10 160/160 [==============================] - 1s 2ms/step - loss: 0.5322 - accuracy: 0.7428 - val_loss: 0.4181 - val_accuracy: 0.7482 Epoch 2/10 160/160 [==============================] - 0s 2ms/step - loss: 0.3301 - accuracy: 0.8474 - val_loss: 0.2549 - val_accuracy: 0.8941 Epoch 3/10 160/160 [==============================] - 0s 2ms/step - loss: 0.2083 - accuracy: 0.9176 - val_loss: 0.2194 - val_accuracy: 0.9075 Epoch 4/10 160/160 [==============================] - 0s 2ms/step - loss: 0.1653 - accuracy: 0.9425 - val_loss: 0.2262 - val_accuracy: 0.9075 Epoch 5/10 160/160 [==============================] - 0s 1ms/step - loss: 0.1382 - accuracy: 0.9492 - val_loss: 0.2254 - val_accuracy: 0.9051 Epoch 6/10 160/160 [==============================] - 0s 1ms/step - loss: 0.1188 - accuracy: 0.9541 - val_loss: 0.2323 - val_accuracy: 0.9051 Epoch 7/10 160/160 [==============================] - 0s 1ms/step - loss: 0.1141 - accuracy: 0.9594 - val_loss: 0.2433 - val_accuracy: 0.9051 Epoch 8/10 160/160 [==============================] - 0s 1ms/step - loss: 0.0939 - accuracy: 0.9651 - val_loss: 0.2614 - val_accuracy: 0.9106 Epoch 9/10 160/160 [==============================] - 0s 2ms/step - loss: 0.0805 - accuracy: 0.9714 - val_loss: 0.2704 - val_accuracy: 0.9106 Epoch 10/10 160/160 [==============================] - 0s 2ms/step - loss: 0.0707 - accuracy: 0.9753 - val_loss: 0.2902 - val_accuracy: 0.9098
model.save("lateruse.h5")
# Evaluate the model
loss, accuracy = model.evaluate(X_test.toarray(), y_test)
print("Test Accuracy:", accuracy)
50/50 [==============================] - 0s 706us/step - loss: 0.2913 - accuracy: 0.9134 Test Accuracy: 0.9134253263473511
# History saves the training into a dictionary structure with the keys below
history.history.keys()
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='lower right')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
# plt.ylim((-0.1, 1.1))
plt.show()
import numpy as np
# Make predictions on the test set
y_pred_prob = model.predict(X_test.toarray())
y_pred = np.round(y_pred_prob).flatten().astype(int)
# Convert predictions back to categorical labels
y_pred_labels = label_encoder.inverse_transform(y_pred)
# Convert true labels back to categorical labels
y_true_labels = label_encoder.inverse_transform(y_test)
# Create the confusion matrix
conf_matrix = confusion_matrix(y_true_labels, y_pred_labels)
# Create a DataFrame for the confusion matrix
conf_matrix_df = pd.DataFrame(conf_matrix, index=label_encoder.classes_, columns=label_encoder.classes_)
50/50 [==============================] - 0s 658us/step
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_df, annot=True, cmap='Blues', fmt='d', cbar=False)
plt.xlabel('Predicted Category')
plt.ylabel('True Category')
plt.title('Confusion Matrix')
plt.show()
# the result is similar to the one i obtained for the validation set,
# compared to the short_description,the model of "headline",the accuray , TN, TP (true posivite )are higher.
# sequential model mistake valid
80+58
138
# logistic regression mistake valid
125+17
142
# randomforest mistake valid
99+37
136
import numpy as np
# Concatenate X_train and X_valid
X_train_valid = np.concatenate((X_train, X_valid), axis=0)
# Concatenate y_train and y_valid
y_train_valid = np.concatenate((y_train, y_valid), axis=0)
from sklearn.feature_extraction.text import TfidfVectorizer
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
# Fit and transform the text data
X_train_valid_vectorized = vectorizer.fit_transform(X_train_valid)
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Create and initialize the Logistic Regression model
logreg_model = LogisticRegression()
# Perform cross-validation and get the accuracy scores
cv_scores = cross_val_score(logreg_model, X_train_valid_vectorized, y_train_valid, cv=5)
# Print the cross-validation results
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))
print("Standard Deviation of CV Accuracy:", np.std(cv_scores))
Cross-Validation Scores: [0.89856373 0.89048474 0.89048474 0.88230009 0.89038634] Mean CV Accuracy: 0.8904439293416632 Standard Deviation of CV Accuracy: 0.005143150809612093
# acrossing cross validation, the model have achieved a reasonbale consistent accuracy
# reason:the mean cv accuracy indicated the overall performance is 0.89
# the small standard deviation suggests that model is relatively stable
import pandas as pd
data.dropna(inplace=True)
X = data['headline']
y = data['category']
X_train_plus_valid, X_test, y_train_plus_valid, y_test = train_test_split(X, y, random_state=0, test_size = 0.30, train_size = 0.7, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_plus_valid, y_train_plus_valid, random_state=0, test_size = 0.199/0.7, train_size = 0.5/0.7, stratify=y_train_plus_valid)
/Users/luqiansong/opt/anaconda3/lib/python3.9/site-packages/pandas/util/_decorators.py:311: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return func(*args, **kwargs)
# View the first few rows of the test set data (X_test)
print("Test set data (X_test):")
print(X_test.head())
Test set data (X_test): 1403 Can You Spot Which Part Of This Tweet Is Of Qu... 1104 Confessions Of A Tonya Harding Apologist 7078 Donald Trump's Supreme Court Pick Came Of Age ... 7899 Week 16 Fantasy Football Focus 4175 Tom Brady Asks Why His Friendship With Donald ... Name: headline, dtype: object
# View the first few rows of the test set labels (y_test)
print("Test set labels (y_test):")
print(y_test.head())
Test set labels (y_test): 1403 SPORTS 1104 SPORTS 7078 POLITICS 7899 SPORTS 4175 SPORTS Name: category, dtype: object
# View the shape of the test set data and labels
print("Test set data shape:", X_test.shape)
print("Test set labels shape:", y_test.shape)
Test set data shape: (2391,) Test set labels shape: (2391,)
from sklearn.feature_extraction.text import TfidfVectorizer
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
# Fit and transform the text data
X_vectorized = vectorizer.fit_transform(X)
from sklearn.preprocessing import LabelEncoder
# Create a label encoder
label_encoder = LabelEncoder()
# Fit and transform the labels
y_encoded = label_encoder.fit_transform(y)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import load_model
# Load the model from the saved file
model = load_model("lateruse.h5")
model.load_weights("lateruse.h5")
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Create a tokenizer and fit it on the headlines in X_test
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_test)
# Convert text to sequences of integers
X_test_sequences = tokenizer.texts_to_sequences(X_test)
# Pad sequences to ensure they all have the same length
max_sequence_length = 1000
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)
# Load the model
model = keras.models.load_model('lateruse.h5')
# Make predictions on the test data
predictions = model.predict(X_test_padded)
# Convert probabilities to binary classes for binary classification
predicted_classes = (predictions > 0.5).astype('int32')
75/75 [==============================] - 0s 575us/step
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
# Convert y_test to binary classes for binary classification
y_test_binary = (y_test == 'SPORTS').astype('int32')
# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test_binary, predicted_classes)
# Visualize the confusion matrix using Seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
xticklabels=['POLOTICS', 'SPORTS'],
yticklabels=['POLOTICS', 'SPORTS'])
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()
import numpy as np
# Concatenate X_train and X_valid
X_train_valid = np.concatenate((X_train, X_valid), axis=0)
# Concatenate y_train and y_valid
y_train_valid = np.concatenate((y_train, y_valid), axis=0)
print("Shape of X_train_valid:", X_train_valid.shape)
print("Shape of y_train_valid:", y_train_valid.shape)
Shape of X_train_valid: (5568,) Shape of y_train_valid: (5568,)
from sklearn.feature_extraction.text import TfidfVectorizer
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
# Fit and transform the text data
X_vectorized = vectorizer.fit_transform(X)
from sklearn.preprocessing import LabelEncoder
# Create a label encoder
label_encoder = LabelEncoder()
# Fit and transform the labels
y_encoded = label_encoder.fit_transform(y)
from tensorflow import keras
from tensorflow.keras import layers
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
# Create the sequential model
model = keras.Sequential([
layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
layers.Dropout(0.5),
layers.Dense(32, activation='relu'),
layers.Dropout(0.5),
layers.Dense(1, activation='sigmoid')
])
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train the model
history = model.fit(X_train.toarray(), y_train, epochs=10, batch_size=32, validation_split=0.2)
Epoch 1/10 128/128 [==============================] - 1s 2ms/step - loss: 0.5863 - accuracy: 0.7310 - val_loss: 0.5056 - val_accuracy: 0.7333 Epoch 2/10 128/128 [==============================] - 0s 1ms/step - loss: 0.4158 - accuracy: 0.7744 - val_loss: 0.3294 - val_accuracy: 0.8676 Epoch 3/10 128/128 [==============================] - 0s 2ms/step - loss: 0.2478 - accuracy: 0.9041 - val_loss: 0.2431 - val_accuracy: 0.8990 Epoch 4/10 128/128 [==============================] - 0s 2ms/step - loss: 0.1827 - accuracy: 0.9328 - val_loss: 0.2380 - val_accuracy: 0.9088 Epoch 5/10 128/128 [==============================] - 0s 1ms/step - loss: 0.1464 - accuracy: 0.9431 - val_loss: 0.2506 - val_accuracy: 0.9098 Epoch 6/10 128/128 [==============================] - 0s 2ms/step - loss: 0.1326 - accuracy: 0.9527 - val_loss: 0.2622 - val_accuracy: 0.9069 Epoch 7/10 128/128 [==============================] - 0s 2ms/step - loss: 0.1133 - accuracy: 0.9590 - val_loss: 0.2884 - val_accuracy: 0.9029 Epoch 8/10 128/128 [==============================] - 0s 2ms/step - loss: 0.0941 - accuracy: 0.9667 - val_loss: 0.3088 - val_accuracy: 0.9020 Epoch 9/10 128/128 [==============================] - 0s 1ms/step - loss: 0.0813 - accuracy: 0.9696 - val_loss: 0.3250 - val_accuracy: 0.9029 Epoch 10/10 128/128 [==============================] - 0s 2ms/step - loss: 0.0745 - accuracy: 0.9735 - val_loss: 0.3421 - val_accuracy: 0.9059
# Evaluate the model
loss, accuracy = model.evaluate(X_valid.toarray(), y_valid)
print("Test Accuracy:", accuracy)
40/40 [==============================] - 0s 723us/step - loss: 0.2468 - accuracy: 0.9169 Test Accuracy: 0.9168627262115479
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='lower right')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
# plt.ylim((-0.1, 1.1))
plt.show()
import numpy as np
# Make predictions on the test set
y_pred_prob = model.predict(X_test.toarray())
y_pred = np.round(y_pred_prob).flatten().astype(int)
# Convert predictions back to categorical labels
y_pred_labels = label_encoder.inverse_transform(y_pred)
# Convert true labels back to categorical labels
y_true_labels = label_encoder.inverse_transform(y_test)
# Create the confusion matrix
conf_matrix = confusion_matrix(y_true_labels, y_pred_labels)
# Create a DataFrame for the confusion matrix
conf_matrix_df = pd.DataFrame(conf_matrix, index=label_encoder.classes_, columns=label_encoder.classes_)
50/50 [==============================] - 0s 622us/step
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_df, annot=True, cmap='Blues', fmt='d', cbar=False)
plt.xlabel('Predicted Category')
plt.ylabel('True Category')
plt.title('Confusion Matrix')
plt.show()
# training the model wiht more data get higher accuracy
# the TP (true positive )adn TN is increasing and the FN AND FP is decreasing